Dataset Name: Monitoring of Federal Criminal Sentences
Year: 2018-2019
Source: United States Sentencing Commission Link to Data
Summary and Previous Work: The data files encompass all records received by the United States Sentencing Commission with sentencing dates falling within the period from October 1, 2018, to September 30, 2019. This data set has been cited a total of 11 times, most of which focus on analyzing disparities in race and gender using techniques such as random coefficient models, logistic regression, and ordinary least squares regression. We hope to expand on this knowledge by focusing on sentence lengths of drug-related crimes.
Question: What factors affect the length of incarceration, in months, for all drug trafficking offenses?
Variables:
# --- Setup: packages and cached data import ---
# One-time import from the raw USSC .tsv, cached locally as CSV:
#dat2019 = read.csv('C:/Users/cakus/Downloads/37990-0001-Data.tsv', sep = '\t', header = T)
#write.csv(dat2019, "Sentencing-Data_Imported.csv")
# NOTE(review): the cache was written as "Sentencing-Data_Imported.csv"
# (hyphen) but is read back as "Sentencing_Data_Imported.csv" (underscore) --
# confirm the actual file name on disk.
library(tidyverse)  # attaches dplyr, ggplot2, readr, etc., so separate
                    # library(dplyr)/library(ggplot2) calls are redundant
library(plotly)
dat2019 <- read.csv("Sentencing_Data_Imported.csv")
# Restrict to the analysis sample:
# - columns of interest only
# - XCRHISSR == 6 (highest criminal-history category -- confirm vs codebook)
# - the seven drug-type codes of interest (COMBDRG2 in 1-6 and 8)
# - primary drug weight (WGT1) capped at 10,000 (units per USSC codebook)
# - OFFGUIDE == 10 (drug-trafficking guideline)
overall2019 <- dat2019 %>%
  dplyr::select(SENTTCAP, OFFGUIDE, AGE, WGT1, BASEHI, MONSEX, COMBDRG2,
                CITIZEN, NEWEDUC, WEAPON, ACCCAT, DISTRICT, AMENDYR, IS924C,
                MONACCEP, NEWCNVTN, NUMDEPEN, DRUGMIN, XCRHISSR, MONRACE,
                HISPORIG)
#colnames(dat2019)
filter2019 <- overall2019 %>%
  filter(XCRHISSR == 6,
         COMBDRG2 %in% c(1, 2, 3, 4, 5, 6, 8),  # replaces the long `|` chain
         WGT1 <= 10000,
         OFFGUIDE == 10) %>%
  na.omit()  # drop rows with any missing value in the selected columns
# Recode conviction type as a labelled factor (labels applied in level
# order -- assumes two underlying codes; confirm against the codebook).
filter2019 <- filter2019 %>%
  mutate(NEWCNVTN = factor(NEWCNVTN, labels = c("Plea Deal", "Trial")))

# Sentence length by plea/trial status
ggplot(filter2019, aes(x = NEWCNVTN, y = SENTTCAP, fill = NEWCNVTN)) +
  geom_boxplot(outlier.size = 2) +
  labs(title = "Boxplot of Sentence Length and Plea/Trial",
       x = "Plea/Trial",
       y = "Sentence Length") +
  theme(plot.title = element_text(hjust = 0.5))
# Five-number summary (min, Q1, median, Q3, max) plus mean of sentence
# length, by plea/trial. fivenum() is evaluated once per group and unpacked
# into columns (the original recomputed it five times per group).
# Requires dplyr >= 1.0 (unnamed data-frame results are unpacked).
filter2019 %>%
  group_by(NEWCNVTN) %>%
  summarise(
    as_tibble_row(setNames(fivenum(SENTTCAP),
                           c("min", "Q1", "median", "Q3", "max"))),
    mean = mean(SENTTCAP)
  )
# Sentence length vs. drug weight, colored by drug type.
# Fix: COMBDRG2 is still a numeric code here, so the original mapped it to a
# continuous color gradient while titling the legend "Type of Drug";
# wrapping it in factor() gives the intended discrete legend.
ggplot(filter2019, aes(y = SENTTCAP, x = WGT1, color = factor(COMBDRG2))) +
  geom_point() +
  labs(title = "Sentence Length vs Weight",
       y = "Sentence Length",
       x = "Weight of Drug") +
  theme(plot.title = element_text(hjust = 0.5)) +
  guides(col = guide_legend("Type of Drug"))
# Sentence length vs. drug weight, grouped by conviction type.
# The legend title is set via labs(color = ...) rather than guides().
ggplot(filter2019, aes(x = WGT1, y = SENTTCAP, color = NEWCNVTN)) +
  geom_point() +
  labs(title = "Sentence Length vs Weight, Grouped by Plea/Trial",
       x = "Weight of Drug",
       y = "Sentence Length",
       color = "Plea/Trial") +
  theme(plot.title = element_text(hjust = 0.5))
# Recode education level as a labelled factor (labels applied in level
# order -- assumes four underlying codes; confirm against the codebook).
filter2019 <- filter2019 %>%
  mutate(NEWEDUC = factor(NEWEDUC,
                          labels = c("Less than HS Grad", "HS Grad",
                                     "Some College", "College Grad")))

# Sentence length by education level
ggplot(filter2019, aes(x = NEWEDUC, y = SENTTCAP, fill = NEWEDUC)) +
  geom_boxplot(outlier.size = 2) +
  labs(title = "Boxplot of Sentence Length and Education Level",
       x = "Education Level",
       y = "Sentence Length") +
  theme(plot.title = element_text(hjust = 0.5))
# Five-number summary (min, Q1, median, Q3, max) plus mean of sentence
# length, by education level. fivenum() is evaluated once per group and
# unpacked into columns (the original recomputed it five times per group).
# Requires dplyr >= 1.0 (unnamed data-frame results are unpacked).
filter2019 %>%
  group_by(NEWEDUC) %>%
  summarise(
    as_tibble_row(setNames(fivenum(SENTTCAP),
                           c("min", "Q1", "median", "Q3", "max"))),
    mean = mean(SENTTCAP)
  )
# Recode the 18 U.S.C. 924(c) weapon-enhancement flag as No/Yes
# (labels applied in level order -- assumes two underlying codes).
filter2019 <- filter2019 %>%
  mutate(IS924C = factor(IS924C, labels = c("No", "Yes")))

# Sentence length by weapon-enhancement charge
ggplot(filter2019, aes(x = IS924C, y = SENTTCAP, fill = IS924C)) +
  geom_boxplot(outlier.size = 2) +
  labs(title = "Boxplot of Sentence Length and Weapon Enhancement Charge",
       x = "Weapon Enhancement Charge",
       y = "Sentence Length") +
  theme(plot.title = element_text(hjust = 0.5))
# Five-number summary (min, Q1, median, Q3, max) plus mean of sentence
# length, by weapon-enhancement charge. fivenum() is evaluated once per
# group and unpacked into columns (the original recomputed it five times).
# Requires dplyr >= 1.0 (unnamed data-frame results are unpacked).
filter2019 %>%
  group_by(IS924C) %>%
  summarise(
    as_tibble_row(setNames(fivenum(SENTTCAP),
                           c("min", "Q1", "median", "Q3", "max"))),
    mean = mean(SENTTCAP)
  )
# Sentence length vs. age, colored by drug type.
# Fixes: the title said "Weight" but the x axis is AGE (copy-paste error);
# COMBDRG2 is wrapped in factor() so the "Type of Drug" legend is discrete
# rather than a continuous gradient.
ggplot(filter2019, aes(y = SENTTCAP, x = AGE, color = factor(COMBDRG2))) +
  geom_point() +
  labs(title = "Sentence Length vs Age",
       y = "Sentence Length",
       x = "Age") +
  theme(plot.title = element_text(hjust = 0.5)) +
  guides(col = guide_legend("Type of Drug"))
# Sentence length vs. age, grouped by conviction type.
# The legend title is set via labs(color = ...) rather than guides().
ggplot(filter2019, aes(x = AGE, y = SENTTCAP, color = NEWCNVTN)) +
  geom_point() +
  labs(title = "Sentence Length vs Age, Grouped by Plea/Trial",
       x = "Age",
       y = "Sentence Length",
       color = "Plea/Trial") +
  theme(plot.title = element_text(hjust = 0.5))
# Sentence length vs. BASEHI, colored by drug type.
# Fixes: the title/labels were copy-pasted from the weight plot (title said
# "Weight", y said "BASEHI", x said "Age", none matching the mapped
# aesthetics); COMBDRG2 is wrapped in factor() for a discrete legend.
# BASEHI is presumed to be the base offense level -- confirm vs codebook.
ggplot(filter2019, aes(y = SENTTCAP, x = BASEHI, color = factor(COMBDRG2))) +
  geom_point() +
  labs(title = "Sentence Length vs Base Offense Level",
       y = "Sentence Length",
       x = "Base Offense Level (BASEHI)") +
  theme(plot.title = element_text(hjust = 0.5)) +
  guides(col = guide_legend("Type of Drug"))
## Histogram of Sentences
# Distribution of sentence length (default binning; ggplot will report the
# bin count it chose).
ggplot(filter2019, aes(x = SENTTCAP)) +
  geom_histogram(color = "darkblue", fill = "lightblue") +
  labs(title = "Sentence Length Distribution",
       x = "Sentence Length") +
  theme(plot.title = element_text(hjust = 0.5))
# Sentence length by sex (assumes MONSEX codes 0 = Male, 1 = Female --
# confirm against the USSC codebook)
ggplot(filter2019,
       aes(x = factor(MONSEX, levels = c("0", "1"),
                      labels = c("Male", "Female")),
           y = SENTTCAP,
           group = MONSEX)) +
  geom_boxplot() +
  labs(x = "Sex", y = "Sentence Length")
# Interactive boxplot of sentence length by citizenship status
# (codes 1-5 per original labels -- confirm against the USSC codebook).
citizen_plot <- ggplot(
  filter2019,
  aes(x = factor(CITIZEN,
                 levels = c("1", "2", "3", "4", "5"),
                 labels = c("US citizen", "Legal alien", "Illegal alien",
                            "Not US citizen", "Extradited Alien")),
      y = SENTTCAP)
) +
  geom_boxplot() +
  labs(x = "Citizenship Status", y = "Sentence Length")
ggplotly(citizen_plot)
# Sentence length by race (MONRACE codes per original labels -- note code 9
# is labelled "Non-US Native American"; confirm against the codebook)
ggplot(filter2019,
       aes(x = factor(MONRACE,
                      levels = c("1", "2", "3", "4", "5", "9"),
                      labels = c("White", "Black", "American Indian",
                                 "Asian/PI", "Multi-racial",
                                 "Non-US Native American")),
           y = SENTTCAP)) +
  geom_boxplot() +
  labs(x = "Race", y = "Sentence Length")
# Sentence length by Hispanic origin (code 0 labelled "NA" in the original;
# presumably unknown/missing -- confirm against the codebook)
ggplot(filter2019,
       aes(x = factor(HISPORIG,
                      levels = c("0", "1", "2"),
                      labels = c("NA", "Non-Hispanic", "Hispanic")),
           y = SENTTCAP,
           group = HISPORIG)) +
  geom_boxplot() +
  labs(x = "Hispanic or Not", y = "Sentence Length")
We will be modeling sentence lengths using linear regression and decision trees (sentence length is a continuous outcome, so ordinary least squares rather than logistic regression is the appropriate model). We will use background research and exploratory analysis to find variables that correlate with sentence length, then create several iterations of models and trees, using metrics such as mean squared error, adjusted R-squared, AIC, and BIC to evaluate the different models.
# Derived predictor: log of drug weight (WGT1 assumed > 0 after filtering --
# confirm; log(0) would produce -Inf).
filter2019$logWGT1 <- log(filter2019$WGT1)

#Testing and Training Set (random 80/20 split)
# set.seed() added so the split -- and every downstream model -- is
# reproducible; `in_train` replaces `sample`, which shadowed base::sample().
set.seed(1)
in_train <- sample(c(TRUE, FALSE), nrow(filter2019), replace = TRUE,
                   prob = c(0.8, 0.2))
train <- filter2019[in_train, ]
test <- filter2019[!in_train, ]

# NOTE(review): this factor conversion happens AFTER the split, so `train`
# and `test` still hold numeric COMBDRG2, and the custom contrasts below are
# never seen by the models (which call as.factor(COMBDRG2) themselves with
# default treatment contrasts). Kept as-is to avoid changing fitted models.
filter2019$COMBDRG2 <- as.factor(filter2019$COMBDRG2)
contrasts(filter2019$COMBDRG2) <- matrix(
  c(1,0,0,0,0, 0,1,0,0,0, 0,0,1,0,0, 0,0,0,0,1),
  nrow = 5
)
# First model: all relevant variables from previous research and EDA.
# Categorical codes are converted inline with as.factor(); the numeric
# predictors (AGE, logWGT1, BASEHI) enter linearly.
trial <- lm(
  SENTTCAP ~ AGE + as.factor(MONSEX) + as.factor(MONRACE) +
    as.factor(NEWEDUC) + logWGT1 + BASEHI + as.factor(IS924C) +
    as.factor(NEWCNVTN) + as.factor(CITIZEN) + as.factor(MONACCEP) +
    as.factor(HISPORIG) + as.factor(COMBDRG2),
  data = train
)
summary(trial)
##
## Call:
## lm(formula = SENTTCAP ~ AGE + as.factor(MONSEX) + as.factor(MONRACE) +
## as.factor(NEWEDUC) + logWGT1 + BASEHI + as.factor(IS924C) +
## as.factor(NEWCNVTN) + as.factor(CITIZEN) + as.factor(MONACCEP) +
## as.factor(HISPORIG) + as.factor(COMBDRG2), data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -195.017 -40.609 -5.218 41.927 282.624
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.9287 25.3402 1.891 0.059012 .
## AGE -0.2360 0.2984 -0.791 0.429169
## as.factor(MONSEX)1 -22.6032 12.2198 -1.850 0.064807 .
## as.factor(MONRACE)2 2.4847 6.9175 0.359 0.719572
## as.factor(MONRACE)3 -14.7550 22.7142 -0.650 0.516184
## as.factor(MONRACE)4 11.0435 27.4227 0.403 0.687291
## as.factor(MONRACE)5 24.8883 62.2702 0.400 0.689521
## as.factor(MONRACE)7 -59.8815 46.0976 -1.299 0.194398
## as.factor(MONRACE)8 -65.8381 45.9950 -1.431 0.152790
## as.factor(NEWEDUC)HS Grad 1.1125 5.6672 0.196 0.844431
## as.factor(NEWEDUC)Some College -2.9520 7.5928 -0.389 0.697560
## as.factor(NEWEDUC)College Grad -27.0848 24.0416 -1.127 0.260334
## logWGT1 1.9173 2.5960 0.739 0.460453
## BASEHI 3.4311 0.9376 3.659 0.000273 ***
## as.factor(IS924C)Yes 44.2382 7.8833 5.612 2.97e-08 ***
## as.factor(NEWCNVTN)Trial 77.1794 25.6156 3.013 0.002687 **
## as.factor(CITIZEN)2 -35.4064 46.5793 -0.760 0.447450
## as.factor(CITIZEN)3 6.9984 21.8057 0.321 0.748356
## as.factor(CITIZEN)4 -16.2170 67.9802 -0.239 0.811526
## as.factor(MONACCEP)-2 15.7886 9.8467 1.603 0.109321
## as.factor(MONACCEP)0 77.1568 24.9937 3.087 0.002107 **
## as.factor(HISPORIG)1 -9.8977 14.6106 -0.677 0.498370
## as.factor(HISPORIG)2 -24.2593 16.6223 -1.459 0.144925
## as.factor(COMBDRG2)2 2.4474 11.4143 0.214 0.830288
## as.factor(COMBDRG2)3 0.7152 10.5679 0.068 0.946066
## as.factor(COMBDRG2)4 -17.3749 27.9299 -0.622 0.534100
## as.factor(COMBDRG2)6 17.7573 12.7754 1.390 0.165017
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 61.92 on 652 degrees of freedom
## Multiple R-squared: 0.4004, Adjusted R-squared: 0.3765
## F-statistic: 16.75 on 26 and 652 DF, p-value: < 2.2e-16
# Backwards stepwise selection on the full model (AIC criterion;
# trace = 0 suppresses the per-step printout)
trial_step <- step(trial, trace = 0)
summary(trial_step)
##
## Call:
## lm(formula = SENTTCAP ~ as.factor(MONSEX) + BASEHI + as.factor(IS924C) +
## as.factor(NEWCNVTN) + as.factor(MONACCEP) + as.factor(HISPORIG),
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -200.857 -41.474 -5.686 43.040 277.601
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.9978 18.5229 1.350 0.177611
## as.factor(MONSEX)1 -21.8132 11.8399 -1.842 0.065866 .
## BASEHI 4.5462 0.4314 10.539 < 2e-16 ***
## as.factor(IS924C)Yes 44.8678 7.6449 5.869 6.89e-09 ***
## as.factor(NEWCNVTN)Trial 85.8001 24.5087 3.501 0.000495 ***
## as.factor(MONACCEP)-2 16.1260 9.4403 1.708 0.088061 .
## as.factor(MONACCEP)0 65.8036 23.7887 2.766 0.005828 **
## as.factor(HISPORIG)1 -5.3649 14.0937 -0.381 0.703578
## as.factor(HISPORIG)2 -20.0518 15.3632 -1.305 0.192278
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 61.7 on 670 degrees of freedom
## Multiple R-squared: 0.3884, Adjusted R-squared: 0.3811
## F-statistic: 53.18 on 8 and 670 DF, p-value: < 2.2e-16
# Final linear model: the step-selected variables, plus COMBDRG2 and
# logWGT1 added back (drug type and weight feed into guideline sentence
# calculations, per the narrative above).
final_model <- lm(
  SENTTCAP ~ as.factor(MONSEX) + BASEHI + as.factor(IS924C) +
    as.factor(NEWCNVTN) + as.factor(MONACCEP) + as.factor(HISPORIG) +
    as.factor(COMBDRG2) + logWGT1,
  data = train
)
summary(final_model)
##
## Call:
## lm(formula = SENTTCAP ~ as.factor(MONSEX) + BASEHI + as.factor(IS924C) +
## as.factor(NEWCNVTN) + as.factor(MONACCEP) + as.factor(HISPORIG) +
## as.factor(COMBDRG2) + logWGT1, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -193.62 -41.62 -5.30 42.64 275.62
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.51408 20.10125 1.767 0.077727 .
## as.factor(MONSEX)1 -23.82958 11.89806 -2.003 0.045603 *
## BASEHI 3.52117 0.93011 3.786 0.000167 ***
## as.factor(IS924C)Yes 44.35615 7.74228 5.729 1.53e-08 ***
## as.factor(NEWCNVTN)Trial 86.97778 24.63863 3.530 0.000444 ***
## as.factor(MONACCEP)-2 15.53321 9.74883 1.593 0.111559
## as.factor(MONACCEP)0 67.53275 23.93683 2.821 0.004926 **
## as.factor(HISPORIG)1 -5.08015 14.23482 -0.357 0.721294
## as.factor(HISPORIG)2 -20.56130 15.55303 -1.322 0.186619
## as.factor(COMBDRG2)2 2.29351 11.24384 0.204 0.838432
## as.factor(COMBDRG2)3 -0.07739 10.35100 -0.007 0.994037
## as.factor(COMBDRG2)4 -15.12616 27.34088 -0.553 0.580283
## as.factor(COMBDRG2)6 16.07370 12.21920 1.315 0.188813
## logWGT1 1.55209 2.56648 0.605 0.545547
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 61.68 on 665 degrees of freedom
## Multiple R-squared: 0.3932, Adjusted R-squared: 0.3813
## F-statistic: 33.15 on 13 and 665 DF, p-value: < 2.2e-16
#Weight and weapon charges
#Assumption Plots
#Plot graphs individually
# Diagnostic packages.
# NOTE(review): library(MASS) attaches MASS::select(), which masks
# dplyr::select(). No later code calls select(), but any new code should
# use dplyr::select() explicitly to avoid surprises.
library(here)
library(dplyr)
library(ggplot2)
library(ggpubr)
library(ggfortify)
library(MASS)
library(lindia)
library(olsrr)
#Ensuring Model Meets Assumptions
autoplot(final_model,which=1, ncol = 1, label.size = 3) + theme_bw()#Residual vs. Fitted
autoplot(final_model,which=2, ncol = 1, label.size = 3) + theme_bw() #QQ
autoplot(final_model, which=4, ncol = 1, label.size = 3) + theme_bw() #Cook's distance
#All points are below .5
#Multicollinearity Test (generalized VIF, since the model has factors)
library(car)
vif(final_model) #All Under 10
## GVIF Df GVIF^(1/(2*Df))
## as.factor(MONSEX) 1.032887 1 1.016310
## BASEHI 7.932103 1 2.816399
## as.factor(IS924C) 1.038758 1 1.019195
## as.factor(NEWCNVTN) 6.005971 1 2.450708
## as.factor(MONACCEP) 10.735437 2 1.810110
## as.factor(HISPORIG) 1.060299 2 1.014746
## as.factor(COMBDRG2) 4.423874 4 1.204274
## logWGT1 5.783147 1 2.404817
#Residual Plot
# Base-graphics residuals-vs-fitted plot with a zero reference line
model_res <- resid(final_model)
plot(fitted(final_model), model_res)
abline(h = 0) #As random as we can get for real data
## Comparing Linear Models Using Metrics
# Higher adjusted R-squared and lower AIC/BIC are better. Recorded output
# is kept below as ## comments.
#R-Squared
summary(trial)$adj.r.squared
## [1] 0.376496
summary(trial_step)$adj.r.squared #Highest
## [1] 0.3810688
summary(final_model)$adj.r.squared #Very close to trial_step
## [1] 0.3813235
#AIC
AIC(trial)
## [1] 7558.348
AIC(trial_step)#Lowest
## [1] 7535.841
AIC(final_model)#Very close to trial_step
## [1] 7540.475
#BIC
BIC(trial)
## [1] 7684.925
BIC(trial_step)#Lowest
## [1] 7581.047
BIC(final_model)#Very close to trial_step
## [1] 7608.285
#Predictions
# Out-of-sample error for each model: test-set RMSE scaled by the test-set
# SD of the outcome (values below 1 roughly indicate an improvement over
# predicting the mean).
trial_predictions = predict(trial, newdata = test)
trial_step_predictions = predict(trial_step, newdata = test)
final_model_predictions = predict(final_model,newdata = test)
sqrt(mean((trial_predictions - test$SENTTCAP)^2)) / sd(test$SENTTCAP)
## [1] 0.7951826
sqrt(mean((trial_step_predictions- test$SENTTCAP)^2)) / sd(test$SENTTCAP)
## [1] 0.8032512
sqrt(mean((final_model_predictions - test$SENTTCAP)^2)) / sd(test$SENTTCAP)
## [1] 0.7959385
We can see that our backwards step regression has the lowest AIC and BIC as well as the highest adjusted R-squared, outperforming the other models on the metrics we used to pick our model. However, the metrics are very similar between the backwards-step model and the third model, which adds two variables that are used to calculate sentence length. Therefore we are choosing the third model as our final linear regression model. We will look at the summary of that model to see which variables are significant.
library(rpart)
# Regression tree on the same predictor set (rpart's default method for a
# numeric response is "anova")
model <- rpart(
  SENTTCAP ~ AGE + MONSEX + MONRACE + NEWEDUC + logWGT1 + BASEHI +
    IS924C + NEWCNVTN + CITIZEN + MONACCEP + HISPORIG,
  data = train
)
plot(model, compress = TRUE)
text(model, cex = 0.7, use.n = TRUE, fancy = FALSE, all = TRUE)
summary(model)
## Call:
## rpart(formula = SENTTCAP ~ AGE + MONSEX + MONRACE + NEWEDUC +
## logWGT1 + BASEHI + IS924C + NEWCNVTN + CITIZEN + MONACCEP +
## HISPORIG, data = train)
## n= 679
##
## CP nsplit rel error xerror xstd
## 1 0.21562933 0 1.0000000 1.0016502 0.07613749
## 2 0.07476488 1 0.7843707 0.7921686 0.04812891
## 3 0.03210995 2 0.7096058 0.7383133 0.04656169
## 4 0.02117246 3 0.6774958 0.7185046 0.04565492
## 5 0.01799232 4 0.6563234 0.6907711 0.04091291
## 6 0.01286577 5 0.6383311 0.7238492 0.04708320
## 7 0.01079762 6 0.6254653 0.7223742 0.04736420
## 8 0.01000000 7 0.6146677 0.7331055 0.04850992
##
## Variable importance
## NEWCNVTN MONACCEP BASEHI logWGT1 IS924C AGE MONRACE MONSEX
## 33 29 19 8 5 3 2 1
##
## Node number 1: 679 observations, complexity param=0.2156293
## mean=149.9564, MSE=6140.975
## left son=2 (639 obs) right son=3 (40 obs)
## Primary splits:
## NEWCNVTN splits as LR, improve=0.21562930, (0 missing)
## MONACCEP < -1 to the left, improve=0.20022640, (0 missing)
## BASEHI < 23 to the left, improve=0.08837669, (0 missing)
## logWGT1 < 2.915161 to the left, improve=0.05904638, (0 missing)
## IS924C splits as LR, improve=0.04446148, (0 missing)
## Surrogate splits:
## MONACCEP < -1 to the left, agree=0.99, adj=0.825, (0 split)
##
## Node number 2: 639 observations, complexity param=0.07476488
## mean=140.852, MSE=4469.77
## left son=4 (361 obs) right son=5 (278 obs)
## Primary splits:
## BASEHI < 29 to the left, improve=0.10914870, (0 missing)
## logWGT1 < 2.915161 to the left, improve=0.06693642, (0 missing)
## MONACCEP < -2.5 to the right, improve=0.04564653, (0 missing)
## IS924C splits as LR, improve=0.02959730, (0 missing)
## MONRACE < 1.5 to the right, improve=0.01019195, (0 missing)
## Surrogate splits:
## logWGT1 < 5.490483 to the left, agree=0.750, adj=0.424, (0 split)
## MONRACE < 1.5 to the right, agree=0.645, adj=0.183, (0 split)
## MONSEX < 0.5 to the left, agree=0.588, adj=0.054, (0 split)
## AGE < 59.5 to the left, agree=0.570, adj=0.011, (0 split)
## CITIZEN < 1.5 to the left, agree=0.567, adj=0.004, (0 split)
##
## Node number 3: 40 observations, complexity param=0.01079762
## mean=295.3995, MSE=10360.62
## left son=6 (32 obs) right son=7 (8 obs)
## Primary splits:
## BASEHI < 31 to the left, improve=0.10864000, (0 missing)
## IS924C splits as LR, improve=0.10747810, (0 missing)
## AGE < 31.5 to the left, improve=0.07504421, (0 missing)
## logWGT1 < 3.598495 to the left, improve=0.05090463, (0 missing)
## NEWEDUC splits as LRL-, improve=0.04295453, (0 missing)
## Surrogate splits:
## logWGT1 < 7.047279 to the left, agree=0.825, adj=0.125, (0 split)
##
## Node number 4: 361 observations, complexity param=0.03210995
## mean=121.469, MSE=3680.832
## left son=8 (321 obs) right son=9 (40 obs)
## Primary splits:
## IS924C splits as LR, improve=0.10076130, (0 missing)
## BASEHI < 17 to the left, improve=0.06494533, (0 missing)
## logWGT1 < 1.641826 to the left, improve=0.06012477, (0 missing)
## MONACCEP < -2.5 to the right, improve=0.03883086, (0 missing)
## AGE < 48.5 to the right, improve=0.03626606, (0 missing)
## Surrogate splits:
## AGE < 22.5 to the right, agree=0.895, adj=0.05, (0 split)
##
## Node number 5: 278 observations, complexity param=0.02117246
## mean=166.022, MSE=4372.857
## left son=10 (196 obs) right son=11 (82 obs)
## Primary splits:
## BASEHI < 33 to the left, improve=0.072622030, (0 missing)
## logWGT1 < 5.720952 to the left, improve=0.038467160, (0 missing)
## AGE < 29.5 to the right, improve=0.010684270, (0 missing)
## HISPORIG < 0.5 to the right, improve=0.006375651, (0 missing)
## MONSEX < 0.5 to the right, improve=0.006002461, (0 missing)
## Surrogate splits:
## logWGT1 < 6.929611 to the left, agree=0.781, adj=0.256, (0 split)
## AGE < 64.5 to the left, agree=0.716, adj=0.037, (0 split)
##
## Node number 6: 32 observations
## mean=278.6247, MSE=10139.18
##
## Node number 7: 8 observations
## mean=362.4987, MSE=5618.481
##
## Node number 8: 321 observations, complexity param=0.01799232
## mean=114.6708, MSE=3025.351
## left son=16 (125 obs) right son=17 (196 obs)
## Primary splits:
## BASEHI < 20.5 to the left, improve=0.07725268, (0 missing)
## logWGT1 < 2.73354 to the left, improve=0.05644127, (0 missing)
## AGE < 53.5 to the right, improve=0.03951030, (0 missing)
## MONACCEP < -2.5 to the right, improve=0.03723461, (0 missing)
## HISPORIG < 1.5 to the right, improve=0.02027234, (0 missing)
## Surrogate splits:
## logWGT1 < 2.779371 to the left, agree=0.866, adj=0.656, (0 split)
## MONACCEP < -2.5 to the right, agree=0.801, adj=0.488, (0 split)
## AGE < 29.5 to the left, agree=0.639, adj=0.072, (0 split)
## NEWEDUC splits as LRRR, agree=0.623, adj=0.032, (0 split)
##
## Node number 9: 40 observations
## mean=176.025, MSE=5593.824
##
## Node number 10: 196 observations
## mean=154.4956, MSE=3507.451
##
## Node number 11: 82 observations, complexity param=0.01286577
## mean=193.573, MSE=5364.763
## left son=22 (65 obs) right son=23 (17 obs)
## Primary splits:
## AGE < 43.5 to the left, improve=0.121949100, (0 missing)
## logWGT1 < 8.628586 to the left, improve=0.047653750, (0 missing)
## NEWEDUC splits as LLR-, improve=0.012566850, (0 missing)
## MONRACE < 1.5 to the right, improve=0.007142106, (0 missing)
## BASEHI < 37 to the left, improve=0.007119122, (0 missing)
## Surrogate splits:
## MONRACE < 3 to the left, agree=0.817, adj=0.118, (0 split)
## CITIZEN < 1.5 to the left, agree=0.805, adj=0.059, (0 split)
##
## Node number 16: 125 observations
## mean=95.52744, MSE=2477.31
##
## Node number 17: 196 observations
## mean=126.8795, MSE=2992.096
##
## Node number 22: 65 observations
## mean=180.4923, MSE=4090.742
##
## Node number 23: 17 observations
## mean=243.5876, MSE=7080.329
library(tree)
set.seed(2)  # reproducible cross-validation folds
# NOTE(review): the object name `tree` shadows tree::tree(); the name is
# kept because later chunks reference the fitted object by it.
tree <- tree(SENTTCAP ~ AGE + MONSEX + MONRACE + NEWEDUC + logWGT1 +
               BASEHI + IS924C + NEWCNVTN + CITIZEN + MONACCEP + HISPORIG,
             data = train)
plot(tree)
text(tree, cex = 0.7)
# Cost-complexity cross-validation (deviance) to pick the tree size.
# (A stray interactive `?tree` help call was removed from this chunk.)
cv.filter2019 <- cv.tree(tree, FUN = prune.tree)
cv.filter2019
## $size
## [1] 9 7 6 5 4 3 2 1
##
## $dev
## [1] 2987906 2984479 2984300 2896480 2922575 3173075 3323431 4258765
##
## $k
## [1] -Inf 51988.23 53646.70 75022.99 88283.27 133889.56 311748.76
## [8] 899114.37
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
# CV deviance vs. tree size, then prune to 5 terminal nodes (the size
# suggested by the deviance curve above)
plot(cv.filter2019$size, cv.filter2019$dev, type = "b")
title(main = "Deviance VS Tree Size")
prune.filter2019 <- prune.tree(tree, best = 5)
plot(prune.filter2019)
text(prune.filter2019, cex = 0.6)
#Predicted variables for Tree
# Test-set predictions from the full tree; error reported as RMSE scaled by
# SD of the outcome, matching the linear-model comparison above.
# (A stray interactive `?predict` help call was removed from this chunk.)
yhat_tree <- predict(tree, newdata = test)
plot(yhat_tree, test$SENTTCAP)
sqrt(mean((yhat_tree - test$SENTTCAP)^2)) / sd(test$SENTTCAP)
## [1] 0.7921585
#Predicted Values for Pruned Down Tree
yhat <- predict(prune.filter2019, newdata = test)
plot(yhat, test$SENTTCAP)
sqrt(mean((yhat - test$SENTTCAP)^2)) / sd(test$SENTTCAP)
## [1] 0.7927329